R libraries.

library(ggplot2)
library(ggrepel)
library(ggpubr)
library(stringr)
library(MASS)
library(RColorBrewer)
library(DESeq2)

library(viridis)
library(ggpointdensity)
library(dplyr)
library(data.table)
library(ComplexHeatmap)
library(UpSetR)
library(readxl)
theme_set(theme_classic())

dat_path = "~/research/data/MorPhiC/December-2023/JAX_RNAseq_ExtraEmbryonic/"
dat_path = paste0(dat_path, "processed/Tables")

Read in meta data

meta_flat = read_excel("data/JAX_RNAseq_ExtraEmbryonic_meta_falttened.xlsx")
meta_flat = as.data.frame(meta_flat)
dim(meta_flat)
## [1] 90 37
meta_flat[1:2,]
##                                                                                                  Dataset
## 1 JAX; CRISPR-Cas9 KO; Primitive Syncytium, Extra-embryonic mesoderm; Bulk RNA-seq; 10X CRISPR; Illumina
## 2 JAX; CRISPR-Cas9 KO; Primitive Syncytium, Extra-embryonic mesoderm; Bulk RNA-seq; 10X CRISPR; Illumina
##                                          Counts file
## 1 PPARG_KO_H02_GT23-11450_GCGCAAGC-CGGCGTGA_S40_L002
## 2 PPARG_KO_G02_GT23-11449_ATATGGAT-TAATACAG_S50_L002
##                                                 Sequence file read 1
## 1 PPARG_KO_H02_GT23-11450_GCGCAAGC-CGGCGTGA_S40_L002_R1_001.fastq.gz
## 2 PPARG_KO_G02_GT23-11449_ATATGGAT-TAATACAG_S50_L002_R1_001.fastq.gz
##                                                 Sequence file read 2
## 1 PPARG_KO_H02_GT23-11450_GCGCAAGC-CGGCGTGA_S40_L002_R2_001.fastq.gz
## 2 PPARG_KO_G02_GT23-11449_ATATGGAT-TAATACAG_S50_L002_R2_001.fastq.gz
##   INPUT LIBRARY PREPARATION ID                 RUN ID
## 1                   GT23-11450 20230918_23-robson-010
## 2                   GT23-11449 20230918_23-robson-010
##   LIBRARY AVERAGE FRAGMENT SIZE LIBRARY INPUT AMOUNT VALUE
## 1                           406                        300
## 2                           402                        300
##   LIBRARY INPUT AMOUNT UNIT LIBRARY FINAL YIELD VALUE LIBRARY FINAL YIELD UNIT
## 1                       ngs                     168.8                      ngs
## 2                       ngs                     199.6                      ngs
##   LIBRARY CONCENTRATION VALUE LIBRARY CONCENTRATION UNIT LIBRARY PCR CYCLES
## 1                        47.2                         nM                 10
## 2                        66.8                         nM                 10
##   LIBRARY PCR CYCLES FOR SAMPLE INDEX DIFFERENTIATED CELL LINE ID
## 1                                  NA           PrS-MOK20026W-H02
## 2                                  NA           PrS-MOK20026W-G02
##                                                       DIFFERENTIATED CELL LINE DESCRIPTION
## 1 KOLF2.2 deleted PPARG by deletion of full coding region (KO) derived primitive syncytium
## 2 KOLF2.2 deleted PPARG by deletion of full coding region (KO) derived primitive syncytium
##   TIMEPOINT TIME-POINT UNIT TERMINALLY DIFFERENTIATED?
## 1         6            days                        Yes
## 2         6            days                        Yes
##                                Model System INPUT CELL LINE ID CLONE ID
## 1 extra-embryonic primitive syncytial cells      MOK20026W-H02      H02
## 2 extra-embryonic primitive syncytial cells      MOK20026W-G02      G02
##    CELL LINE ID CELL LINE TYPE DERIVED FROM CELL LINE NAME
## 1 MOK20026W-H02           iPSC                    KOLF2.2J
## 2 MOK20026W-G02           iPSC                    KOLF2.2J
##                                          CELL LINE DESCRIPTION
## 1 KOLF2.2 deleted PPARG by deletion of full coding region (KO)
## 2 KOLF2.2 deleted PPARG by deletion of full coding region (KO)
##   ALTERED GENE SYMBOLS ALTERATION TYPE TARGETED GENOMIC REGION
## 1                PPARG        deletion full coding region (KO)
## 2                PPARG        deletion full coding region (KO)
##   TARGETED GENOMIC REGION COORDINATES                            GUIDE SEQUENCE
## 1              chr3:12379702-12417187 CAACCATGGTCATTTCTGAA;GATCTTCTATGAAAGAGGGT
## 2              chr3:12379702-12417187 CAACCATGGTCATTTCTGAA;GATCTTCTATGAAAGAGGGT
##   ZYGOSITY GENE EXPRESSION ALTERATION PROTOCOL ID
## 1      N.A                               JAXPE001
## 2      N.A                               JAXPE001
##   LIBRARY PREPARATION PROTOCOL ID SEQUENCING PROTOCOL ID
## 1                        JAXPL001               JAXPS001
## 2                        JAXPL001               JAXPS001
##   DIFFERENTIATION PROTOCOL ID
## 1                    JAXPD001
## 2                    JAXPD001
names(meta_flat)
##  [1] "Dataset"                               
##  [2] "Counts file"                           
##  [3] "Sequence file read 1"                  
##  [4] "Sequence file read 2"                  
##  [5] "INPUT LIBRARY PREPARATION ID"          
##  [6] "RUN ID"                                
##  [7] "LIBRARY AVERAGE FRAGMENT SIZE"         
##  [8] "LIBRARY INPUT AMOUNT VALUE"            
##  [9] "LIBRARY INPUT AMOUNT UNIT"             
## [10] "LIBRARY FINAL YIELD VALUE"             
## [11] "LIBRARY FINAL YIELD UNIT"              
## [12] "LIBRARY CONCENTRATION VALUE"           
## [13] "LIBRARY CONCENTRATION UNIT"            
## [14] "LIBRARY PCR CYCLES"                    
## [15] "LIBRARY PCR CYCLES FOR SAMPLE INDEX"   
## [16] "DIFFERENTIATED CELL LINE ID"           
## [17] "DIFFERENTIATED CELL LINE DESCRIPTION"  
## [18] "TIMEPOINT"                             
## [19] "TIME-POINT UNIT"                       
## [20] "TERMINALLY DIFFERENTIATED?"            
## [21] "Model System"                          
## [22] "INPUT CELL LINE ID"                    
## [23] "CLONE ID"                              
## [24] "CELL LINE ID"                          
## [25] "CELL LINE TYPE"                        
## [26] "DERIVED FROM CELL LINE NAME"           
## [27] "CELL LINE DESCRIPTION"                 
## [28] "ALTERED GENE SYMBOLS"                  
## [29] "ALTERATION TYPE"                       
## [30] "TARGETED GENOMIC REGION"               
## [31] "TARGETED GENOMIC REGION COORDINATES"   
## [32] "GUIDE SEQUENCE"                        
## [33] "ZYGOSITY"                              
## [34] "GENE EXPRESSION ALTERATION PROTOCOL ID"
## [35] "LIBRARY PREPARATION PROTOCOL ID"       
## [36] "SEQUENCING PROTOCOL ID"                
## [37] "DIFFERENTIATION PROTOCOL ID"

Gather cell line id information

meta_flat$`DIFFERENTIATED CELL LINE ID`[1:5]
## [1] "PrS-MOK20026W-H02" "PrS-MOK20026W-G02" "PrS-MOK20026W-E02"
## [4] "PrS-MOK20026P-E12" "PrS-MOK20026P-D10"
cell_line_id = str_split(meta_flat$`DIFFERENTIATED CELL LINE ID`, pattern = '-')
table(sapply(cell_line_id, length))
## 
##  3  4 
## 66 24
cell_line_id = lapply(cell_line_id, function(x) { length(x) <- 4; x})
cell_line_id = as.data.frame(do.call(rbind, cell_line_id))
dim(cell_line_id)
## [1] 90  4
names(cell_line_id) = c("model_system", "cell_line", "clone_id", "condition")
cell_line_id[1:2,]
##   model_system cell_line clone_id condition
## 1          PrS MOK20026W      H02      <NA>
## 2          PrS MOK20026W      G02      <NA>
cell_line_id$condition[which(is.na(cell_line_id$condition))] = "nor"

table(meta_flat$`Model System`, cell_line_id$model_system)
##                                            
##                                             ExM PrS
##   extra-embryonic mesenchymal cells          12   0
##   extra-embryonic primitive syncytial cells   0  78
table(cell_line_id$model_system, cell_line_id$condition, useNA="ifany")
##      
##       hyp nor
##   ExM   0  12
##   PrS  12  66

Add run_id and ko_strategy.

w2update = which(meta_flat$`RUN ID` == "20231004_23-robson-008-run2")
meta_flat$run_id = meta_flat$`RUN ID`
meta_flat$run_id[w2update] = "20230918_23-robson-008"

table(meta_flat$run_id, cell_line_id$model_system, useNA="ifany")
##                              
##                               ExM PrS
##   20230809_23-robson-005-run2   0  30
##   20230918_23-robson-008        0  24
##   20230918_23-robson-009        0  12
##   20230918_23-robson-010        0  12
##   20231004_23-robson-011       12   0
table(meta_flat$run_id, cell_line_id$condition, useNA="ifany")
##                              
##                               hyp nor
##   20230809_23-robson-005-run2   0  30
##   20230918_23-robson-008       12  12
##   20230918_23-robson-009        0  12
##   20230918_23-robson-010        0  12
##   20231004_23-robson-011        0  12
table(meta_flat$`TARGETED GENOMIC REGION`, useNA="ifany")
## 
##                critical exon (CE)           full coding region (KO) 
##                                24                                24 
## premature termination codon (PTC)                              <NA> 
##                                24                                18
meta_flat$ko_strategy = str_extract(meta_flat$`TARGETED GENOMIC REGION`, 
                                    "\\(\\S+\\)$")
meta_flat$ko_strategy = str_remove_all(meta_flat$ko_strategy, "\\(|\\)")
meta_flat$ko_strategy[which(is.na(meta_flat$ko_strategy))] = "WT"

table(meta_flat$`TARGETED GENOMIC REGION`, meta_flat$ko_strategy, 
      useNA="ifany")
##                                    
##                                     CE KO PTC WT
##   critical exon (CE)                24  0   0  0
##   full coding region (KO)            0 24   0  0
##   premature termination codon (PTC)  0  0  24  0
##   <NA>                               0  0   0 18
meta_flat$ko_gene = meta_flat$`ALTERED GENE SYMBOLS`
meta_flat$ko_gene[which(is.na(meta_flat$ko_gene))] = "WT"

table(cell_line_id$model_system, meta_flat$ko_gene)
##      
##       EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
##   ExM     0    0    0     0    9      0     0  3
##   PrS    18    9    9     9    0      9     9 15
table(meta_flat$run_id, meta_flat$ko_gene)
##                              
##                               EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
##   20230809_23-robson-005-run2     0    9    0     9    0      9     0  3
##   20230918_23-robson-008         18    0    0     0    0      0     0  6
##   20230918_23-robson-009          0    0    9     0    0      0     0  3
##   20230918_23-robson-010          0    0    0     0    0      0     9  3
##   20231004_23-robson-011          0    0    0     0    9      0     0  3
table(meta_flat$ko_strategy, meta_flat$ko_gene)
##      
##       EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
##   CE      6    3    3     3    3      3     3  0
##   KO      6    3    3     3    3      3     3  0
##   PTC     6    3    3     3    3      3     3  0
##   WT      0    0    0     0    0      0     0 18

Read in gene count data and filter genes

meta = cbind(meta_flat, cell_line_id)
cts = fread(file.path(dat_path, "genesCounts.csv"), data.table = FALSE)
dim(cts)
## [1] 38592    91
cts[1:2, c(1:2, (ncol(cts)-1):ncol(cts))]
##              Name POU2F3_KO_F04_GT23-10504_GGTACCTT-GACGTCTT_S33_L001
## 1 ENSG00000268674                                                   0
## 2 ENSG00000271254                                                1030
##   PTC_A09__Hypoxia_GT23-11309_AACGTTCC-AGTACTCC_S30_L001
## 1                                                      0
## 2                                                    489
##   PTC_D10__Hypoxia_GT23-11310_GCAGAATT-TGGCCGGT_S22_L001
## 1                                                      0
## 2                                                    603
setequal(names(cts)[-1], meta$`Counts file`)
## [1] TRUE
meta = meta[match(names(cts)[-1], meta$`Counts file`),]
table(names(cts)[-1] == meta$`Counts file`)
## 
## TRUE 
##   90
cts_dat = data.matrix(cts[,-1])
rownames(cts_dat) = cts$Name

Discard genes whose expression is 0 in more than 75% of samples

model_s = meta$model_system
get_q75 <- function(x){quantile(x, probs = 0.75)}

gene_info = data.frame(Name = cts$Name, 
                       t(apply(cts_dat, 1, tapply, model_s, min)), 
                       t(apply(cts_dat, 1, tapply, model_s, median)), 
                       t(apply(cts_dat, 1, tapply, model_s, get_q75)), 
                       t(apply(cts_dat, 1, tapply, model_s, max)))

names(gene_info)[2:9] = paste0(rep(c("ExM_", "PrS_"), times=4), 
                               rep(c("min", "median", "q75", "max"), each=2))
dim(gene_info)
## [1] 38592     9
gene_info[1:2,]
##                            Name ExM_min PrS_min ExM_median PrS_median ExM_q75
## ENSG00000268674 ENSG00000268674       0       0        0.0        0.0    0.00
## ENSG00000271254 ENSG00000271254     634     387      812.5      775.5  848.25
##                 PrS_q75 ExM_max PrS_max
## ENSG00000268674       0       0       3
## ENSG00000271254     923     948    1169
summary(gene_info[,-1])
##     ExM_min            PrS_min           ExM_median         PrS_median      
##  Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0  
##  Median :     0.0   Median :     0.0   Median :     2.0   Median :     1.0  
##  Mean   :   376.3   Mean   :   255.1   Mean   :   530.5   Mean   :   585.1  
##  3rd Qu.:   120.0   3rd Qu.:    60.0   3rd Qu.:   188.5   3rd Qu.:   182.5  
##  Max.   :512761.0   Max.   :154723.0   Max.   :797550.5   Max.   :379480.0  
##     ExM_q75            PrS_q75            ExM_max            PrS_max      
##  Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   Min.   :     0  
##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     1  
##  Median :     3.0   Median :     3.0   Median :     6.0   Median :    10  
##  Mean   :   608.8   Mean   :   712.7   Mean   :   751.6   Mean   :  1102  
##  3rd Qu.:   228.0   3rd Qu.:   233.8   3rd Qu.:   285.0   3rd Qu.:   405  
##  Max.   :886970.0   Max.   :456738.2   Max.   :928104.0   Max.   :801404
table(gene_info$ExM_q75 > 0, gene_info$PrS_q75 > 0)
##        
##         FALSE  TRUE
##   FALSE 12757   959
##   TRUE   2139 22737
w2kp = which(gene_info$ExM_q75 > 0 | gene_info$PrS_q75 > 0)
cts_dat = cts_dat[w2kp,]
dim(cts_dat)
## [1] 25835    90
gene_info = gene_info[w2kp,]

meta$read_depth = colSums(cts_dat)/1e6
meta$q75 = apply(cts_dat, 2, quantile, probs = 0.75)

ggplot(meta, aes(x=read_depth, y=q75, color=ko_strategy, shape = model_system)) +
    geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") + 
  scale_shape_manual(values = c(7, 16)) + 
  xlab("read depth (million)") + ylab("75 percentile of gene expression") + 
  labs(color = "KO type", shape = "Model") + 
  scale_color_brewer(palette = "Set1")

meta$run_id_short = str_replace(meta$run_id, "^[^-]*-", "")
ggplot(meta, aes(x=read_depth, y=q75, color=run_id_short, shape = condition)) +
    geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") + 
  scale_shape_manual(values = c(7, 16)) + 
  xlab("read depth (million)") + ylab("75 percentile of gene expression") + 
  labs(color = "Run ID", shape = "Condition") + 
  scale_color_brewer(palette = "Set1")

Check the effect of hypoxia vs. normoxia among WT

sample2kp = which(meta$ko_gene == "WT")
cts_dat_m = cts_dat[,sample2kp]
meta_m    = meta[sample2kp,]
summary(meta_m$q75)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   361.0   422.2   469.8   478.4   540.5   592.0
stopifnot(all(meta_m$file_id == names(cts_dat_m)))

q75 = apply(cts_dat_m, 1, quantile, probs=0.75)
cts_dat_n = t(cts_dat_m[q75 > 0,])
cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
dim(cts_dat_n)
## [1]    18 23865
sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
names(sample_pca)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
pc_scores = sample_pca$x
eigen_vals = sample_pca$sdev^2
eigen_vals[1:5]/sum(eigen_vals)
## [1] 0.41336077 0.18367275 0.08208396 0.04567809 0.03608958
pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
pvs
## [1] 41.3 18.4  8.2  4.6  3.6
PC_df = data.frame(pc_scores)
PC_df = cbind(PC_df, meta_m)

gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=condition, color=model_system)) +
  geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
    scale_shape_manual(values = c(7, 15, 16, 17)) + 
  labs(color="KO gene", shape ="KO strategy") + 
  xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
  ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
  ggtitle("Wild type samples") + 
  guides(
    color = guide_legend(title = NULL, order = 1),
    shape = guide_legend(title = NULL, order = 2)
  ) +
  theme(
    legend.margin = margin(0, 0, 0, 0), 
    legend.box.just = "left", 
    legend.direction = "vertical" 
  )

print(gPC)

Check the impact of run_id and hypoxia vs. normoxia effect on WT

gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=condition, color=run_id_short)) +
  geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
    scale_shape_manual(values = c(7, 15, 16, 17)) + 
  labs(color="KO gene", shape ="KO strategy") + 
  xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
  ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
  ggtitle("Wild type samples") + 
  guides(
    color = guide_legend(title = NULL, order = 1),
    shape = guide_legend(title = NULL, order = 2)
  ) +
  theme(
    legend.margin = margin(0, 0, 0, 0), 
    legend.box.just = "left", 
    legend.direction = "vertical" 
  )

print(gPC)

table(meta_m$run_id, meta_m$model_system)
##                              
##                               ExM PrS
##   20230809_23-robson-005-run2   0   3
##   20230918_23-robson-008        0   6
##   20230918_23-robson-009        0   3
##   20230918_23-robson-010        0   3
##   20231004_23-robson-011        3   0

DE analysis with respect to model system and condition (hypoxia vs. normoxia)

## Generate sample information matrix

cols2kp = c("model_system", "condition", "q75")
sample_info = meta_m[,cols2kp]

dim(sample_info)
## [1] 18  3
sample_info[1:2,]
##    model_system condition   q75
## 12          PrS       nor 569.5
## 68          PrS       hyp 545.0
sample_info$log_q75 = log(sample_info$q75)

dds = DESeqDataSetFromMatrix(cts_dat_m, colData=sample_info, 
                                  ~ model_system + condition + log_q75)
dds = DESeq(dds)

## association with read-depth
res_rd = results(dds, name = "log_q75")
res_rd = as.data.frame(res_rd)
dim(res_rd)
## [1] 25835     6
res_rd[1:2,]
##                  baseMean log2FoldChange     lfcSE       stat    pvalue
## ENSG00000271254 731.89279      0.0114876 0.2318513 0.04954726 0.9604832
## ENSG00000277196  46.50349      0.9817827 1.3388605 0.73329722 0.4633772
##                      padj
## ENSG00000271254 0.9997047
## ENSG00000277196 0.9997047
table(is.na(res_rd$padj))
## 
## FALSE  TRUE 
## 17751  8084
g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) + 
  geom_histogram(color="darkblue", fill="lightblue", 
                 breaks = seq(0,1,by=0.01)) + 
  ggtitle("Read depth")
print(g0)

## association with model_system
dds_lrt = DESeq(dds, test="LRT", reduced = ~ condition + log_q75)
res_lrt = results(dds_lrt)

res_model_system = as.data.frame(res_lrt)
dim(res_model_system)
## [1] 25835     6
res_model_system[1:2,]
##                  baseMean log2FoldChange     lfcSE       stat     pvalue
## ENSG00000271254 731.89279      0.0114876 0.2318513 4.06378556 0.04381218
## ENSG00000277196  46.50349      0.9817827 1.3388605 0.04552192 0.83104721
##                       padj
## ENSG00000271254 0.06908888
## ENSG00000277196 0.87097036
table(is.na(res_model_system$padj))
## 
## FALSE  TRUE 
## 23736  2099
g0 = ggplot(res_model_system %>% subset(!is.na(padj)), aes(x=pvalue)) + 
  geom_histogram(color="darkblue", fill="lightblue", 
                 breaks = seq(0,1,by=0.01)) + 
  ggtitle("Model system")
print(g0)

## association with condition
dds_lrt = DESeq(dds, test="LRT", reduced = ~ model_system + log_q75)
res_lrt = results(dds_lrt)

res_condition = as.data.frame(res_lrt)
dim(res_condition)
## [1] 25835     6
res_condition[1:2,]
##                  baseMean log2FoldChange     lfcSE      stat     pvalue
## ENSG00000271254 731.89279      0.0114876 0.2318513 0.2926949 0.58849873
## ENSG00000277196  46.50349      0.9817827 1.3388605 2.9420549 0.08630088
##                      padj
## ENSG00000271254 0.6952827
## ENSG00000277196 0.1494475
table(is.na(res_condition$padj))
## 
## FALSE  TRUE 
## 22739  3096
g0 = ggplot(res_condition %>% subset(!is.na(padj)), aes(x=pvalue)) + 
  geom_histogram(color="darkblue", fill="lightblue", 
                 breaks = seq(0,1,by=0.01)) + 
  ggtitle("Condition")
print(g0)

Analyze samples of different model system or conditions

meta$model_condition = paste(meta$model_system, meta$condition, sep="_")
table(meta$model_condition)
## 
## ExM_nor PrS_hyp PrS_nor 
##      12      12      66
table(meta$run_id, meta$model_condition)
##                              
##                               ExM_nor PrS_hyp PrS_nor
##   20230809_23-robson-005-run2       0       0      30
##   20230918_23-robson-008            0      12      12
##   20230918_23-robson-009            0       0      12
##   20230918_23-robson-010            0       0      12
##   20231004_23-robson-011           12       0       0
for(model1 in unique(meta$model_condition)){
  print(model1)
  sample2kp = which(meta$model_condition == model1)
  cts_dat_m = cts_dat[,sample2kp]
  meta_m    = meta[sample2kp,]

  table(meta_m$ko_strategy, substr(colnames(cts_dat_m), 1, 7))
  stopifnot(all(meta_m$file_id == names(cts_dat_m)))
  q75 = apply(cts_dat_m, 1, quantile, probs=0.75)

  cts_dat_n = t(cts_dat_m[q75 > 0,])
  cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
  dim(cts_dat_n)
  
  sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
  names(sample_pca)
  pc_scores = sample_pca$x
  eigen_vals = sample_pca$sdev^2
  eigen_vals[1:5]/sum(eigen_vals)
  pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
  pvs
  
  PC_df = data.frame(pc_scores)
  PC_df = cbind(PC_df, meta_m)
  
  gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=ko_strategy, color=ko_gene)) +
    geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
      scale_shape_manual(values = c(7, 15, 16, 17)) + 
    xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
    ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
    ggtitle(model1) + 
    guides(
      color = guide_legend(title = NULL, order = 1),
      shape = guide_legend(title = NULL, order = 2)
    ) +
    theme(
      legend.margin = margin(0, 0, 0, 0), 
      legend.box.just = "left", 
      legend.direction = "vertical" 
    )

  print(gPC)
  
  gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene)) +
    geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
      scale_shape_manual(values = c(7, 15, 16, 17)) + 
    xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
    ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
    ggtitle(model1) + 
    guides(
      color = guide_legend(title = NULL, order = 1),
      shape = guide_legend(title = NULL, order = 2)
    ) +
    theme(
      legend.margin = margin(0, 0, 0, 0), 
      legend.box.just = "left", 
      legend.direction = "vertical" 
    )

  print(gPC)
  
  table(meta_m$run_id, meta_m$ko_gene)
  
  ## Generate sample information matrix
  
  cols2kp = c("ko_strategy", "ko_gene", "run_id", "q75")
  sample_info = meta_m[,cols2kp]
  
  dim(sample_info)
  sample_info[1:2,]
  
  table(sample_info$ko_strategy, sample_info$ko_gene)
  
  sample_info$ko_group = paste(sample_info$ko_gene, sample_info$ko_strategy, sep="_")
  table(sample_info$ko_group)

  sample_info$log_q75 = log(sample_info$q75)
  
  if(model1 == "PrS_nor"){
    dds = DESeqDataSetFromMatrix(cts_dat_m, colData=sample_info, 
                                    ~ ko_group + run_id + log_q75)
  }else{
    dds = DESeqDataSetFromMatrix(cts_dat_m, colData=sample_info, 
                                    ~ ko_group + log_q75)
  }
  
  dds = DESeq(dds)
  
  ## association with read-depth
  res_rd = results(dds, name = "log_q75")
  res_rd = as.data.frame(res_rd)
  dim(res_rd)
  res_rd[1:2,]
  
  table(is.na(res_rd$padj))
  
  g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) + 
    geom_histogram(color="darkblue", fill="lightblue", 
                   breaks = seq(0,1,by=0.01)) + 
    ggtitle(paste0(model1, " read depth"))
  print(g0)
  
  ## association with run_id
  if(model1 == "PrS_nor"){
    dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group + log_q75)
    res_lrt = results(dds_lrt)

    res_run_id = as.data.frame(res_lrt)
    dim(res_run_id)
    res_run_id[1:2,]
  
    table(is.na(res_run_id$padj))
    
    g0 = ggplot(res_run_id %>% subset(!is.na(padj)), aes(x=pvalue)) + 
      geom_histogram(color="darkblue", fill="lightblue", 
                     breaks = seq(0,1,by=0.01)) + 
      ggtitle(paste0(model1, " Run ID"))
    print(g0)
  }
  
  ## DE analysis for each KO gene
  table(sample_info$ko_gene)
  
  genos = setdiff(unique(sample_info$ko_gene), "WT")
  genos
  
  ko_grp  = c("CE", "KO", "PTC")
  res_geno_df = NULL
    
  for(geno in genos){
    
    res_geno = list()
    
    for(k_grp1 in ko_grp){
      res_g1 = results(dds, contrast = c("ko_group", 
                                         paste0(geno, "_", k_grp1), "WT_WT"))
      res_g1 = signif(data.frame(res_g1), 3)
      res_geno[[k_grp1]] = res_g1
      
      g1 = ggplot(res_g1 %>% subset(!is.na(padj)), aes(x=pvalue)) + 
        geom_histogram(color="darkblue", fill="lightblue", 
                       breaks = seq(0,1,by=0.01)) + 
        ggtitle(paste0(geno, "_", k_grp1))
      print(g1)

      tag1 = sprintf("%s_%s_%s_", model1, geno, k_grp1)
      colnames(res_g1) = paste0(tag1, colnames(res_g1))
      
      if(is.null(res_geno_df)){
        res_geno_df = res_g1
      }else if(!is.null(res_geno_df)){
        stopifnot(all(rownames(res_geno_df) == rownames(res_g1)))
        res_geno_df = cbind(res_geno_df, res_g1)
      }
    }
    
    get_index <- function(x){
      which(x$padj < 0.01 & abs(x$log2FoldChange) > log2(1.5))
    }
    
    w_ce = get_index(res_geno[["CE"]])
    w_ko = get_index(res_geno[["KO"]])
    w_ptc = get_index(res_geno[["PTC"]])

    m = make_comb_mat(list("CE" = rownames(res_geno[["CE"]])[w_ce], 
              "KO" = rownames(res_geno[["KO"]])[w_ko], 
              "PTC" = rownames(res_geno[["PTC"]])[w_ptc]))
    g_up = UpSet(m)
    
    print(g_up)
    
    df1 = data.frame(padj_CE = res_geno[["CE"]]$padj, 
                     padj_KO = res_geno[["KO"]]$padj, 
                     padj_PTC = res_geno[["PTC"]]$padj)

    cr1 = cor(df1$padj_PTC, df1$padj_CE, method = "spearman", use="pair")
    cr2 = cor(df1$padj_PTC, df1$padj_KO, method = "spearman", use="pair")
    
    g4 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC), 
                                          y = -log10(padj_CE))) +
      geom_pointdensity() + ggtitle(sprintf("%s, Spearman r = %.2f", geno, cr1)) + 
      scale_color_viridis()
    
    g5 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC), 
                                          y = -log10(padj_KO))) +
      geom_pointdensity() + ggtitle(sprintf("%s, Spearman r = %.2f", geno, cr1)) + 
      scale_color_viridis()
    
    print(g4)
    print(g5)
  }
  
  dim(res_geno_df)
  res_geno_df[1:2,]
  
  res_df = data.frame(gene_ID=rownames(res_geno_df), res_geno_df)
  dim(res_df)
  res_df[1:2,]

  fwrite(res_df, sprintf("results/%s_DEseq2.tsv", model1), sep="\t")
}
## [1] "PrS_nor"

## [1] "PrS_hyp"

## [1] "ExM_nor"

gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  7735622 413.2   15844260 846.2         NA 15844260 846.2
## Vcells 29437967 224.6   95670713 730.0      65536 95670713 730.0
sessionInfo()
## R version 4.2.3 (2023-03-15)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] readxl_1.4.2                UpSetR_1.4.0               
##  [3] ComplexHeatmap_2.14.0       data.table_1.14.8          
##  [5] dplyr_1.1.2                 ggpointdensity_0.1.0       
##  [7] viridis_0.6.2               viridisLite_0.4.1          
##  [9] DESeq2_1.38.3               SummarizedExperiment_1.28.0
## [11] Biobase_2.58.0              MatrixGenerics_1.10.0      
## [13] matrixStats_1.0.0           GenomicRanges_1.50.2       
## [15] GenomeInfoDb_1.34.9         IRanges_2.32.0             
## [17] S4Vectors_0.36.2            BiocGenerics_0.44.0        
## [19] RColorBrewer_1.1-3          MASS_7.3-58.2              
## [21] stringr_1.5.0               ggpubr_0.6.0               
## [23] ggrepel_0.9.3               ggplot2_3.4.2              
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_2.1-0       ggsignif_0.6.4         rjson_0.2.21          
##  [4] circlize_0.4.15        XVector_0.38.0         GlobalOptions_0.1.2   
##  [7] clue_0.3-65            rstudioapi_0.14        farver_2.1.1          
## [10] bit64_4.0.5            AnnotationDbi_1.60.2   fansi_1.0.4           
## [13] codetools_0.2-19       doParallel_1.0.17      cachem_1.0.7          
## [16] geneplotter_1.76.0     knitr_1.44             jsonlite_1.8.4        
## [19] broom_1.0.4            annotate_1.76.0        cluster_2.1.4         
## [22] png_0.1-8              compiler_4.2.3         httr_1.4.6            
## [25] backports_1.4.1        Matrix_1.6-4           fastmap_1.1.1         
## [28] cli_3.6.1              htmltools_0.5.5        tools_4.2.3           
## [31] gtable_0.3.3           glue_1.6.2             GenomeInfoDbData_1.2.9
## [34] Rcpp_1.0.10            carData_3.0-5          cellranger_1.1.0      
## [37] jquerylib_0.1.4        vctrs_0.6.2            Biostrings_2.66.0     
## [40] iterators_1.0.14       xfun_0.39              lifecycle_1.0.3       
## [43] rstatix_0.7.2          XML_3.99-0.14          zlibbioc_1.44.0       
## [46] scales_1.2.1           parallel_4.2.3         yaml_2.3.7            
## [49] memoise_2.0.1          gridExtra_2.3          sass_0.4.5            
## [52] stringi_1.7.12         RSQLite_2.3.1          foreach_1.5.2         
## [55] BiocParallel_1.32.6    shape_1.4.6            rlang_1.1.0           
## [58] pkgconfig_2.0.3        bitops_1.0-7           evaluate_0.20         
## [61] lattice_0.20-45        purrr_1.0.1            labeling_0.4.2        
## [64] bit_4.0.5              tidyselect_1.2.0       plyr_1.8.8            
## [67] magrittr_2.0.3         R6_2.5.1               generics_0.1.3        
## [70] DelayedArray_0.24.0    DBI_1.1.3              pillar_1.9.0          
## [73] withr_2.5.0            KEGGREST_1.38.0        abind_1.4-5           
## [76] RCurl_1.98-1.12        tibble_3.2.1           crayon_1.5.2          
## [79] car_3.1-2              utf8_1.2.3             rmarkdown_2.21        
## [82] GetoptLong_1.0.5       locfit_1.5-9.8         blob_1.2.4            
## [85] digest_0.6.31          xtable_1.8-4           tidyr_1.3.0           
## [88] munsell_0.5.0          bslib_0.4.2
renv::snapshot(lockfile = "step2_DE_testing.lock")
## - The lockfile is already up to date.